In [ ]:
%%sql -d standard
SELECT
id, title, body, view_count, tags
FROM
`gcp-samples2.stackoverflow_demo.top100K_posts`
ORDER BY
view_count DESC
LIMIT
5
In [ ]:
%%sql -d standard
CREATE TEMPORARY FUNCTION segmentation(body STRING)
RETURNS ARRAY<STRING>
LANGUAGE js AS """
// remove HTML tags, convert to lowercase and extract words
return body.replace(/(<[^>]+>|&#[^;]+;)/g, '').toLowerCase().match(/\\w\\w+/g);
""";
SELECT
id,
segmentation(CONCAT(title, body, tags)) as words
FROM
`gcp-samples2.stackoverflow_demo.top100K_posts`
LIMIT
5
# (for full query on 10M posts takes about 30 secs with 14.1GB)
In [ ]:
%%sql -d standard
CREATE TEMPORARY FUNCTION calc_tf_idf(words ARRAY<STRING>)
RETURNS STRING
LANGUAGE js AS """
// count each word in this post
var word_count = new Object();
for (word of words) {
if (word_count[word]) {
word_count[word]++;
} else {
word_count[word] = 1;
}
}
// calculate TF-IDF values for each word
// tf = word count / total word count
// idf = log (100K posts / # of posts with the word)
// tf_idf = tf * idf
var total_posts = 100000;
var words_in_post = words.length;
var tf_idf = new Object();
var norm_sum = 0;
for (word in word_count) {
if (word_dict[word] == null) {
word_dict[word] = 1;
}
var tf = (word_count[word] / words_in_post);
var idf = Math.log(total_posts / word_dict[word]);
tf_idf[word] = tf * idf;
norm_sum += tf_idf[word]^2;
}
// normarizing TF-IDF values with L2 norm
for (word in tf_idf) {
tf_idf[word] = (tf_idf[word] / Math.sqrt(norm_sum)).toFixed(5);
if (tf_idf[word] < 0.01) {
delete tf_idf[word]; // remove trivial words
}
}
return JSON.stringify(tf_idf);
"""
OPTIONS (
library="gs://gcp-samples2-stackoverflow/word_dict_0.js",
library="gs://gcp-samples2-stackoverflow/word_dict_1.js",
library="gs://gcp-samples2-stackoverflow/word_dict_2.js",
library="gs://gcp-samples2-stackoverflow/word_dict_3.js",
library="gs://gcp-samples2-stackoverflow/word_dict_4.js",
library="gs://gcp-samples2-stackoverflow/word_dict_5.js",
library="gs://gcp-samples2-stackoverflow/word_dict_6.js"
);
SELECT
id, calc_tf_idf(words) AS tf_idf
FROM
`gcp-samples2.stackoverflow_demo.top100K_posts_segmented` AS posts
LIMIT
10
# (for full query on 10M posts takes 50 secs with 12.2GB)
In [ ]:
%%sql -d standard
CREATE TEMPORARY FUNCTION calc_similarity(tf_idf_json_0 STRING, tf_idf_json_1 STRING)
RETURNS FLOAT64
LANGUAGE js AS """
// parse JSON to extract tf_idf
var tf_idf_0 = JSON.parse(tf_idf_json_0);
var tf_idf_1 = JSON.parse(tf_idf_json_1);
// calculate cosine similarity
var similarity = 0;
for (word in tf_idf_0) {
var t0 = tf_idf_0[word] ? Number(tf_idf_0[word]) : 0;
var t1 = tf_idf_1[word] ? Number(tf_idf_1[word]) : 0;
similarity += t0 * t1;
}
return similarity;
""";
SELECT
title,
body,
tags,
similarity
FROM
(
SELECT
t1.id,
calc_similarity(tf_idf_0, t1.tf_idf) AS similarity
FROM
(
SELECT tf_idf AS tf_idf_0
FROM `gcp-samples2.stackoverflow_demo.top100K_posts_tf_idf` AS t0
WHERE id = 92082
)
CROSS JOIN
`gcp-samples2.stackoverflow_demo.top100K_posts_tf_idf` AS t1
ORDER BY
similarity DESC
LIMIT
10
)
JOIN
`gcp-samples2.stackoverflow_demo.top100K_posts` AS t2
USING (id)
ORDER BY
similarity DESC
# (for full query on 10M posts takes about 16 secs with 16 GB)
# try other posts :
# 5585779 (String to Int in Java)
# 1789945 (substring in JS)
# 92082 (add a column in SQL Server)
# 503093 (redirecting in jQuery)
In [ ]: